/*
 * pairs_sum - per-work-group pair sum exchanged through local memory.
 *
 * Each work-item computes a_in[base] + a_in[base + 1], where
 * base = global id - local id in dimension 0, and stores it in its own slot
 * of a local scratch array. After a local-memory barrier it adds its slot to
 * the slot of the next work-item in the group (wrapping around at the local
 * size) and writes the result to b_out at its global id.
 *
 * a_in:  input buffer; elements [base] and [base + 1] are read.
 * b_out: output buffer; element [global id] is written.
 *
 * NOTE(review): the scratch array has only 2 slots, while it is indexed by
 * the local id and by (local id + 1) % local size. The kernel therefore
 * appears to require a local size of at most 2 in dimension 0; a larger
 * work-group would index past the end of i_mid. Confirm the host code
 * enforces this.
 */
__kernel void pairs_sum(__global int* a_in,
                           __global int* b_out)
{
    __local int i_mid[2];

    /* Keep ids in size_t, the type the work-item functions return, so they
     * are not narrowed to int and not mixed with lsize in the modulo. */
    const size_t gid = get_global_id(0);
    const size_t lid = get_local_id(0);
    const size_t lsize = get_local_size(0);
    const size_t base = gid - lid;

    i_mid[lid] = a_in[base] + a_in[base + 1];

    /* Every slot must be written before a neighbour's slot is read. */
    barrier(CLK_LOCAL_MEM_FENCE);

    b_out[gid] = i_mid[lid] + i_mid[(lid + 1) % lsize];
}
